Author: Marah Shahin
ID: 500055421
UPI: msha846
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tsfresh
from IPython.display import Image
sns.set()
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Both recordings share one CSV layout, so the reader options are written once.
# header=0 together with names= replaces the header row found in the file
# with the column labels given here.
_csv_opts = dict(
    parse_dates=['time'],
    index_col=[0],
    header=0,
    names=['time', 'ax', 'ay', 'az', '|a|'],
)
swimDF = pd.read_csv('swimGood.csv', **_csv_opts)
fbDF = pd.read_csv('passing.csv', **_csv_opts)
swimDF.shape, fbDF.shape
((16172, 4), (14607, 4))
swimDF.head(), fbDF.head()
( ax ay az |a|
time
2020-09-23 13:41:41.704 -0.41 -0.04 0.24 0.47
2020-09-23 13:41:41.714 -0.27 0.46 -0.17 0.55
2020-09-23 13:41:41.725 -0.16 0.29 -0.84 0.90
2020-09-23 13:41:41.735 0.01 -0.12 -1.09 1.09
2020-09-23 13:41:41.745 0.06 -0.18 -0.84 0.86,
ax ay az |a|
time
2020-09-24 12:30:03.214 -0.22 1.23 -0.17 1.26
2020-09-24 12:30:03.224 -0.37 0.44 -0.53 0.78
2020-09-24 12:30:03.260 -0.34 -1.00 -0.54 1.18
2020-09-24 12:30:03.261 -0.38 -0.80 -0.03 0.88
2020-09-24 12:30:03.262 -0.41 -0.29 0.11 0.51)
The first activity was swimming. The lane was 25 metres in length and the swimming style was backstroke. The phone was mounted on the left arm facing leftward from the subject (figure below). The phone was secured vertically and the subject experienced little movement from the phone throughout the activity.
Image(filename = "meSwim.jpg")
The next activity was football. Specifically, the subject was passing a ball back and forth for just over 2 minutes. The phone was mounted in the same location as in the first activity to avoid unnecessary differences between data collections. A snapshot of the activity and phone can be seen in the figure below.
Image(filename='meBall2.jpg')
# Plot every column of each raw recording against its timestamp index.
swimDF.plot(title ='Swimming Accelerometer Signal')
fbDF.plot(title ='Football Accelerometer Signal')
<matplotlib.axes._subplots.AxesSubplot at 0x1bfd9a00>
It's clear that football results in an overall greater variance of magnitude throughout the activity. It's also interesting that the transition period for swimming has a predominantly greater magnitude than the football activity signal (possibly due to getting out of the pool, getting the band off, etc.). The transition period for football is of a much smaller magnitude (another person ended the recording for me). Nevertheless, these periods are clearly visible and will be removed in the next steps. Even on visual inspection, the signals differ in magnitude and pattern, so they are likely to yield a solid, reliable classification model.
# Build the timeline vectors: elapsed time of every sample relative to the
# first sample of its own recording.
delta_tSwim = swimDF.index - swimDF.index[0]
delta_tFB = fbDF.index - fbDF.index[0]
delta_tSwim, delta_tFB
(TimedeltaIndex([ '00:00:00', '00:00:00.010000', '00:00:00.021000',
'00:00:00.031000', '00:00:00.041000', '00:00:00.078000',
'00:00:00.079000', '00:00:00.080000', '00:00:00.082000',
'00:00:00.091000',
...
'00:02:42.203000', '00:02:42.205000', '00:02:42.206000',
'00:02:42.207000', '00:02:42.217000', '00:02:42.228000',
'00:02:42.236000', '00:02:42.246000', '00:02:42.256000',
'00:02:42.266000'],
dtype='timedelta64[ns]', name='time', length=16172, freq=None),
TimedeltaIndex([ '00:00:00', '00:00:00.010000', '00:00:00.046000',
'00:00:00.047000', '00:00:00.048000', '00:00:00.050000',
'00:00:00.060000', '00:00:00.070000', '00:00:00.080000',
'00:00:00.090000',
...
'00:02:26.449000', '00:02:26.459000', '00:02:26.469000',
'00:02:26.479000', '00:02:26.489000', '00:02:26.528000',
'00:02:26.529000', '00:02:26.530000', '00:02:26.532000',
'00:02:26.539000'],
dtype='timedelta64[ns]', name='time', length=14607, freq=None))
# Store each recording's elapsed-time vector as a regular 'delta_t' column.
for frame, elapsed in ((swimDF, delta_tSwim), (fbDF, delta_tFB)):
    frame['delta_t'] = elapsed
swimDF.head(), fbDF.head()
( ax ay az |a| delta_t
time
2020-09-23 13:41:41.704 -0.41 -0.04 0.24 0.47 00:00:00
2020-09-23 13:41:41.714 -0.27 0.46 -0.17 0.55 00:00:00.010000
2020-09-23 13:41:41.725 -0.16 0.29 -0.84 0.90 00:00:00.021000
2020-09-23 13:41:41.735 0.01 -0.12 -1.09 1.09 00:00:00.031000
2020-09-23 13:41:41.745 0.06 -0.18 -0.84 0.86 00:00:00.041000,
ax ay az |a| delta_t
time
2020-09-24 12:30:03.214 -0.22 1.23 -0.17 1.26 00:00:00
2020-09-24 12:30:03.224 -0.37 0.44 -0.53 0.78 00:00:00.010000
2020-09-24 12:30:03.260 -0.34 -1.00 -0.54 1.18 00:00:00.046000
2020-09-24 12:30:03.261 -0.38 -0.80 -0.03 0.88 00:00:00.047000
2020-09-24 12:30:03.262 -0.41 -0.29 0.11 0.51 00:00:00.048000)
# Swimming
# Elapsed time as float seconds. total_seconds() is used instead of
# `.seconds + .microseconds/1e6` because `.seconds` is only the seconds
# component (it restarts at 0 after every full day), so the component sum
# is wrong for any recording of one day or longer.
swimDF['seconds'] = delta_tSwim.total_seconds()
# Move the timestamp index into a 'time' column so rows can be trimmed by position.
swimDF2 = swimDF.reset_index()
# Cut the transition periods: the first 40 and the last 2000 samples.
swimDF3 = swimDF2.drop(swimDF2.index[0:40])
swimDF4 = swimDF3.drop(swimDF3.index[-2000:])
swimDF4.plot(title ='Swimming Pruned Signal', y='|a|',x='delta_t')
# Football
fbDF['seconds'] = delta_tFB.total_seconds()
fbDF2 = fbDF.reset_index()
# Cut the transition periods: the first 600 and the last 500 samples.
fbDF3 = fbDF2.drop(fbDF2.index[0:600])
fbDF4 = fbDF3.drop(fbDF3.index[-500:])
fbDF4.plot(title ='Football Pruned Signal', y='|a|',x='delta_t')
<matplotlib.axes._subplots.AxesSubplot at 0x1e9fa748>
# Renumber the rows from 0 after pruning; the old positional labels are discarded.
swimDF4=swimDF4.reset_index(drop=True)
swimDF4.head()
| time | ax | ay | az | |a| | delta_t | seconds | |
|---|---|---|---|---|---|---|---|
| 0 | 2020-09-23 13:41:42.107 | 0.07 | -0.07 | -0.25 | 0.26 | 00:00:00.403000 | 0.403 |
| 1 | 2020-09-23 13:41:42.116 | 0.06 | 0.10 | -0.23 | 0.25 | 00:00:00.412000 | 0.412 |
| 2 | 2020-09-23 13:41:42.126 | 0.00 | 0.26 | -0.08 | 0.27 | 00:00:00.422000 | 0.422 |
| 3 | 2020-09-23 13:41:42.136 | -0.02 | 0.20 | 0.04 | 0.20 | 00:00:00.432000 | 0.432 |
| 4 | 2020-09-23 13:41:42.146 | -0.02 | -0.06 | -0.04 | 0.07 | 00:00:00.442000 | 0.442 |
# Renumber the rows from 0 after pruning; the old positional labels are discarded.
fbDF4 = fbDF4.reset_index(drop=True)
fbDF4.head()
| time | ax | ay | az | |a| | delta_t | seconds | |
|---|---|---|---|---|---|---|---|
| 0 | 2020-09-24 12:30:09.233 | 0.78 | -0.58 | -3.04 | 3.19 | 00:00:06.019000 | 6.019 |
| 1 | 2020-09-24 12:30:09.243 | 0.79 | -0.49 | -3.09 | 3.22 | 00:00:06.029000 | 6.029 |
| 2 | 2020-09-24 12:30:09.283 | 0.81 | -0.48 | -3.11 | 3.24 | 00:00:06.069000 | 6.069 |
| 3 | 2020-09-24 12:30:09.285 | 0.85 | -0.60 | -3.08 | 3.25 | 00:00:06.071000 | 6.071 |
| 4 | 2020-09-24 12:30:09.286 | 0.86 | -0.78 | -3.00 | 3.21 | 00:00:06.072000 | 6.072 |
The signals are now 142 seconds (first sample is at 0.4 s) and 135 seconds (141 − 6) long for the swimming and football activities, respectively.
# Window number of every sample: each run of 100 consecutive rows forms one
# window (rows 0-99 -> 0, rows 100-199 -> 1, ...).
# Integer floor division replaces np.floor + the `np.int` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
swimWindow_inx = np.asarray(swimDF4.index // 100, dtype=int)
swimWindow_inx
array([ 0, 0, 0, ..., 141, 141, 141])
# Window number of every sample: each run of 100 consecutive rows forms one
# window (rows 0-99 -> 0, rows 100-199 -> 1, ...).
# Integer floor division replaces np.floor + the `np.int` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
fbWindow_inx = np.asarray(fbDF4.index // 100, dtype=int)
fbWindow_inx
array([ 0, 0, 0, ..., 135, 135, 135])
# Tag every sample with its window id: 's' followed by the window number
# zero-padded to three digits (e.g. 's000').
swimDF4['window_idx'] = [f's{window_no:03}' for window_no in swimWindow_inx]
swimDF4.tail()
| time | ax | ay | az | |a| | delta_t | seconds | window_idx | |
|---|---|---|---|---|---|---|---|---|
| 14127 | 2020-09-23 13:44:03.842 | -0.70 | 0.81 | -0.83 | 1.35 | 00:02:22.138000 | 142.138 | s141 |
| 14128 | 2020-09-23 13:44:03.852 | -0.42 | 0.53 | -1.06 | 1.25 | 00:02:22.148000 | 142.148 | s141 |
| 14129 | 2020-09-23 13:44:03.862 | -0.15 | 0.22 | -1.24 | 1.26 | 00:02:22.158000 | 142.158 | s141 |
| 14130 | 2020-09-23 13:44:03.872 | 0.13 | -0.03 | -1.42 | 1.42 | 00:02:22.168000 | 142.168 | s141 |
| 14131 | 2020-09-23 13:44:03.882 | 0.24 | -0.34 | -1.65 | 1.70 | 00:02:22.178000 | 142.178 | s141 |
# Tag every sample with its window id: 'fb' followed by the window number
# zero-padded to three digits (e.g. 'fb000').
fbDF4['window_idx'] = [f'fb{window_no:03}' for window_no in fbWindow_inx]
fbDF4.tail()
| time | ax | ay | az | |a| | delta_t | seconds | window_idx | |
|---|---|---|---|---|---|---|---|---|
| 13502 | 2020-09-24 12:32:24.728 | -0.24 | -2.63 | 1.42 | 2.99 | 00:02:21.514000 | 141.514 | fb135 |
| 13503 | 2020-09-24 12:32:24.729 | 0.11 | -0.70 | 1.58 | 1.73 | 00:02:21.515000 | 141.515 | fb135 |
| 13504 | 2020-09-24 12:32:24.730 | 0.53 | 1.53 | 1.55 | 2.24 | 00:02:21.516000 | 141.516 | fb135 |
| 13505 | 2020-09-24 12:32:24.745 | 0.82 | 3.45 | 1.35 | 3.79 | 00:02:21.531000 | 141.531 | fb135 |
| 13506 | 2020-09-24 12:32:24.746 | 0.76 | 4.47 | 1.15 | 4.67 | 00:02:21.532000 | 141.532 | fb135 |
swimDF4.groupby('window_idx').count()
| time | ax | ay | az | |a| | delta_t | seconds | |
|---|---|---|---|---|---|---|---|
| window_idx | |||||||
| s000 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s001 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s002 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s003 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s004 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| s137 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s138 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s139 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s140 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s141 | 32 | 32 | 32 | 32 | 32 | 32 | 32 |
142 rows × 7 columns
fbDF4.groupby('window_idx').count()
| time | ax | ay | az | |a| | delta_t | seconds | |
|---|---|---|---|---|---|---|---|
| window_idx | |||||||
| fb000 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb001 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb002 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb003 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb004 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| fb131 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb132 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb133 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb134 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb135 | 7 | 7 | 7 | 7 | 7 | 7 | 7 |
136 rows × 7 columns
# Remove windows with fewer than 100 samples.
# The size of each window is looked up from the data instead of dropping a
# hard-coded number of trailing rows, so the filter stays correct if the
# pruning above (and therefore the size of the last window) changes.
swimDF5 = swimDF4[
    swimDF4['window_idx'].map(swimDF4['window_idx'].value_counts()) >= 100
]
swimDF5.groupby('window_idx').count()
| time | ax | ay | az | |a| | delta_t | seconds | |
|---|---|---|---|---|---|---|---|
| window_idx | |||||||
| s000 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s001 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s002 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s003 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s004 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| s136 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s137 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s138 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s139 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| s140 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
141 rows × 7 columns
# Remove windows with fewer than 100 samples.
# The size of each window is looked up from the data instead of dropping a
# hard-coded number of trailing rows, so the filter stays correct if the
# pruning of the football signal (and therefore its last window) changes.
fbDF5 = fbDF4[
    fbDF4['window_idx'].map(fbDF4['window_idx'].value_counts()) >= 100
]
fbDF5.groupby('window_idx').count()
| time | ax | ay | az | |a| | delta_t | seconds | |
|---|---|---|---|---|---|---|---|
| window_idx | |||||||
| fb000 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb001 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb002 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb003 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb004 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| fb130 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb131 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb132 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb133 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| fb134 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
135 rows × 7 columns
# Stack both activities into a single frame, indexed by timestamp with the
# index left unnamed.
df = pd.concat([swimDF5, fbDF5], ignore_index=True).set_index('time')
df.index.name = None
# 'delta_t' is dropped here; the 'seconds' column is kept.
df2 = df.drop('delta_t', axis=1)
df2.head()
| ax | ay | az | |a| | seconds | window_idx | |
|---|---|---|---|---|---|---|
| 2020-09-23 13:41:42.107 | 0.07 | -0.07 | -0.25 | 0.26 | 0.403 | s000 |
| 2020-09-23 13:41:42.116 | 0.06 | 0.10 | -0.23 | 0.25 | 0.412 | s000 |
| 2020-09-23 13:41:42.126 | 0.00 | 0.26 | -0.08 | 0.27 | 0.422 | s000 |
| 2020-09-23 13:41:42.136 | -0.02 | 0.20 | 0.04 | 0.20 | 0.432 | s000 |
| 2020-09-23 13:41:42.146 | -0.02 | -0.06 | -0.04 | 0.07 | 0.442 | s000 |
There are 141 complete (with 100 rows) windows for swimming and 135 windows for football.
from tsfresh.feature_extraction import extract_features
# Extract tsfresh features per window: rows are grouped by 'window_idx' and
# ordered by 'seconds' inside each group; the remaining columns
# (ax, ay, az, |a|) are the time series the features are computed from.
X = extract_features(df2, column_id='window_idx', column_sort='seconds')
Feature Extraction: 100%|██████████| 20/20 [00:44<00:00, 2.23s/it]
X.head()
| ax__variance_larger_than_standard_deviation | ax__has_duplicate_max | ax__has_duplicate_min | ax__has_duplicate | ax__sum_values | ax__abs_energy | ax__mean_abs_change | ax__mean_change | ax__mean_second_derivative_central | ax__median | ... | |a|__fourier_entropy__bins_2 | |a|__fourier_entropy__bins_3 | |a|__fourier_entropy__bins_5 | |a|__fourier_entropy__bins_10 | |a|__fourier_entropy__bins_100 | |a|__permutation_entropy__dimension_3__tau_1 | |a|__permutation_entropy__dimension_4__tau_1 | |a|__permutation_entropy__dimension_5__tau_1 | |a|__permutation_entropy__dimension_6__tau_1 | |a|__permutation_entropy__dimension_7__tau_1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fb000 | 1.0 | 0.0 | 0.0 | 1.0 | -94.81 | 477.9639 | 0.313131 | -0.034343 | 0.001429 | -0.840 | ... | 0.223718 | 0.319026 | 0.413917 | 0.612670 | 1.383152 | 1.248731 | 1.818217 | 2.329479 | 2.793903 | 3.215116 |
| fb001 | 1.0 | 0.0 | 0.0 | 1.0 | -108.94 | 968.7772 | 0.576465 | 0.033838 | 0.003265 | -0.465 | ... | 0.096509 | 0.192626 | 0.192626 | 0.356468 | 0.919223 | 1.295477 | 1.970640 | 2.634906 | 3.208433 | 3.720443 |
| fb002 | 1.0 | 0.0 | 0.0 | 1.0 | 37.79 | 664.0563 | 0.529394 | -0.031414 | -0.003622 | 0.595 | ... | 0.274921 | 0.386735 | 0.537786 | 0.827416 | 1.577573 | 1.207890 | 1.754074 | 2.316838 | 2.881776 | 3.301884 |
| fb003 | 1.0 | 0.0 | 0.0 | 1.0 | -12.44 | 983.4576 | 0.576364 | 0.030909 | 0.004286 | -1.055 | ... | 0.165443 | 0.192626 | 0.288342 | 0.383650 | 0.973848 | 1.345547 | 2.005534 | 2.667043 | 3.240872 | 3.722407 |
| fb004 | 1.0 | 0.0 | 0.0 | 1.0 | -12.01 | 1158.7785 | 0.401414 | -0.039394 | 0.002806 | 0.685 | ... | 0.165443 | 0.329286 | 0.451359 | 0.668811 | 1.164305 | 1.220558 | 1.795744 | 2.294986 | 2.750332 | 3.092134 |
5 rows × 3136 columns
X.shape
(276, 3136)
The feature matrix X has 276 rows (one per window: 141 swimming + 135 football) and 3136 columns, i.e. 3136 features were extracted for each window.
# Create the boolean target: True for swimming windows, False for football.
g = df2.groupby('window_idx')
# Iterating the groupby yields the window ids in sorted order (all 'fb###'
# ids first, then all 's###' ids).
windows = [name for name, _ in g]
# Swimming ids are 's###' and football ids are 'fb###'. The prefix is tested
# explicitly; a substring test ('s' in name) would misclassify any other
# label that merely contains an 's'.
TF = [name.startswith('s') for name in windows]
d = {'window_idx': windows, 'col2': TF}  # create dictionary
y = pd.DataFrame(data=d, index=windows)  # create dataframe
# Keep only the label column, indexed by window id.
y2 = y.drop(['window_idx'], axis=1)
y2.index.name = 'window_idx'
y3 = y2['col2']
y3
window_idx
fb000 False
fb001 False
fb002 False
fb003 False
fb004 False
...
s136 True
s137 True
s138 True
s139 True
s140 True
Name: col2, Length: 276, dtype: bool
from tsfresh.transformers import FeatureSelector
# Fit tsfresh's FeatureSelector against the activity labels y3, using only the
# feature columns of X that contain no NaN values.
select = FeatureSelector()
select.fit(X.dropna(axis=1), y3)
FeatureSelector()
# Names of the features the fitted selector reports as relevant.
select.relevant_features
attr_"abs"__coeff_39', 'ay__fft_coefficient__attr_"abs"__coeff_40', 'az__fft_coefficient__attr_"abs"__coeff_14', 'ay__fft_coefficient__attr_"abs"__coeff_31', 'az__fft_coefficient__attr_"abs"__coeff_10', 'az__quantile__q_0.1', 'ay__fft_coefficient__attr_"abs"__coeff_48', 'az__fft_coefficient__attr_"abs"__coeff_3', 'ax__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.8', 'ax__fft_coefficient__attr_"abs"__coeff_10', '|a|__fft_coefficient__attr_"abs"__coeff_1', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"', 'ax__fft_coefficient__attr_"abs"__coeff_17', 'ay__fft_coefficient__attr_"abs"__coeff_13', 'ay__fft_coefficient__attr_"abs"__coeff_44', '|a|__fft_coefficient__attr_"abs"__coeff_28', 'az__spkt_welch_density__coeff_5', 'az__fft_coefficient__attr_"abs"__coeff_6', 'az__fft_coefficient__attr_"abs"__coeff_9', 'az__fft_coefficient__attr_"abs"__coeff_2', 'az__fft_coefficient__attr_"abs"__coeff_5', '|a|__spkt_welch_density__coeff_8', 'az__minimum', 'ay__fft_coefficient__attr_"abs"__coeff_47', 'ay__spkt_welch_density__coeff_5', 'ax__fft_coefficient__attr_"abs"__coeff_20', 'az__fft_coefficient__attr_"abs"__coeff_17', 'ax__fft_coefficient__attr_"abs"__coeff_26', 'ay__fft_coefficient__attr_"abs"__coeff_46', 'ax__fft_coefficient__attr_"abs"__coeff_14', 'az__fft_coefficient__attr_"abs"__coeff_11', '|a|__fft_coefficient__attr_"abs"__coeff_30', '|a|__fft_coefficient__attr_"abs"__coeff_33', 'ay__fft_coefficient__attr_"abs"__coeff_37', 'ax__fft_coefficient__attr_"abs"__coeff_1', 'ay__variance_larger_than_standard_deviation', 'ax__fft_coefficient__attr_"abs"__coeff_3', 'ax__fft_coefficient__attr_"abs"__coeff_23', 'ay__fft_coefficient__attr_"abs"__coeff_5', 'ax__fft_coefficient__attr_"abs"__coeff_28', 'az__spkt_welch_density__coeff_2', 'ay__fft_coefficient__attr_"abs"__coeff_50', 'az__fft_coefficient__attr_"abs"__coeff_12', 'ay__fft_coefficient__attr_"abs"__coeff_49', 'az__fft_coefficient__attr_"abs"__coeff_16', 'az__fft_coefficient__attr_"abs"__coeff_13', 
'az__variance_larger_than_standard_deviation', 'ay__quantile__q_0.3', 'ax__fft_coefficient__attr_"abs"__coeff_30', 'ax__fft_coefficient__attr_"abs"__coeff_21', 'az__spkt_welch_density__coeff_8', 'az__fft_coefficient__attr_"abs"__coeff_18', 'ax__fft_coefficient__attr_"abs"__coeff_22', 'az__quantile__q_0.8', 'ax__spkt_welch_density__coeff_8', 'ax__fft_coefficient__attr_"abs"__coeff_27', '|a|__fft_coefficient__attr_"abs"__coeff_39', '|a|__fft_coefficient__attr_"abs"__coeff_40', '|a|__fft_coefficient__attr_"abs"__coeff_2', 'az__fft_coefficient__attr_"abs"__coeff_15', 'ay__ar_coefficient__coeff_4__k_10', 'ax__fft_coefficient__attr_"abs"__coeff_29', 'ax__fft_coefficient__attr_"abs"__coeff_16', '|a|__fft_coefficient__attr_"abs"__coeff_31', 'ax__fft_coefficient__attr_"abs"__coeff_34', '|a|__fft_coefficient__attr_"abs"__coeff_38', 'ax__fft_coefficient__attr_"abs"__coeff_13', 'az__change_quantiles__f_agg_"mean"__isabs_True__qh_0.2__ql_0.0', 'az__fft_coefficient__attr_"abs"__coeff_27', 'ax__fft_coefficient__attr_"abs"__coeff_15', '|a|__fft_coefficient__attr_"abs"__coeff_45', 'ax__fft_coefficient__attr_"abs"__coeff_12', 'az__fft_coefficient__attr_"abs"__coeff_20', 'ay__ar_coefficient__coeff_3__k_10', '|a|__fft_coefficient__attr_"abs"__coeff_36', '|a|__fft_coefficient__attr_"abs"__coeff_32', '|a|__fft_coefficient__attr_"abs"__coeff_35', '|a|__fft_coefficient__attr_"abs"__coeff_34', '|a|__fft_coefficient__attr_"abs"__coeff_37', 'ax__quantile__q_0.8', 'az__fft_coefficient__attr_"abs"__coeff_23', 'az__change_quantiles__f_agg_"var"__isabs_False__qh_0.2__ql_0.0', 'ax__fft_coefficient__attr_"abs"__coeff_25', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"max"', 'ax__fft_coefficient__attr_"abs"__coeff_33', '|a|__fft_coefficient__attr_"abs"__coeff_42', 'ax__fft_coefficient__attr_"abs"__coeff_36', '|a|__fft_coefficient__attr_"abs"__coeff_43', 'az__fft_coefficient__attr_"abs"__coeff_21', 'ax__fft_coefficient__attr_"abs"__coeff_35', 
'az__fft_coefficient__attr_"abs"__coeff_33', 'az__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"', 'ax__fft_coefficient__attr_"abs"__coeff_37', '|a|__fft_coefficient__attr_"abs"__coeff_44', 'az__fft_coefficient__attr_"abs"__coeff_19', '|a|__fft_coefficient__attr_"abs"__coeff_41', 'az__quantile__q_0.2', '|a|__spkt_welch_density__coeff_2', 'ax__fft_coefficient__attr_"abs"__coeff_38', 'ax__fft_coefficient__attr_"abs"__coeff_0', 'az__fft_coefficient__attr_"abs"__coeff_32', '|a|__cwt_coefficients__coeff_14__w_20__widths_(2, 5, 10, 20)', 'az__change_quantiles__f_agg_"var"__isabs_True__qh_0.2__ql_0.0', 'az__fft_coefficient__attr_"abs"__coeff_26', 'ax__fft_coefficient__attr_"abs"__coeff_32', '|a|__fft_coefficient__attr_"abs"__coeff_47', 'ax__fft_coefficient__attr_"abs"__coeff_40', 'az__fft_coefficient__attr_"abs"__coeff_1', '|a|__cwt_coefficients__coeff_13__w_20__widths_(2, 5, 10, 20)', 'ax__fft_coefficient__attr_"abs"__coeff_39', 'ax__spkt_welch_density__coeff_2', 'az__fft_coefficient__attr_"abs"__coeff_34', 'ax__fft_coefficient__attr_"abs"__coeff_31', 'az__fft_coefficient__attr_"abs"__coeff_30', '|a|__fft_coefficient__attr_"abs"__coeff_48', 'az__fft_coefficient__attr_"abs"__coeff_28', '|a|__fft_coefficient__attr_"abs"__coeff_49', 'ax__quantile__q_0.3', 'az__fft_coefficient__attr_"abs"__coeff_37', 'ax__fft_coefficient__attr_"abs"__coeff_42', 'az__fft_coefficient__attr_"abs"__coeff_22', '|a|__fft_coefficient__attr_"abs"__coeff_46', '|a|__cwt_coefficients__coeff_12__w_20__widths_(2, 5, 10, 20)', 'az__fft_coefficient__attr_"abs"__coeff_36', 'az__fft_coefficient__attr_"abs"__coeff_35', 'ax__fft_coefficient__attr_"abs"__coeff_44', 'ax__fft_coefficient__attr_"abs"__coeff_41', 'ax__fft_coefficient__attr_"abs"__coeff_43', 'az__fft_coefficient__attr_"abs"__coeff_24', 'ax__fft_coefficient__attr_"abs"__coeff_46', 'az__fft_coefficient__attr_"abs"__coeff_31', '|a|__cwt_coefficients__coeff_11__w_20__widths_(2, 5, 10, 20)', 'az__number_crossing_m__m_1', 
'az__fft_coefficient__attr_"abs"__coeff_25', 'az__fft_coefficient__attr_"abs"__coeff_38', 'ax__fft_coefficient__attr_"abs"__coeff_47', 'ax__fft_coefficient__attr_"abs"__coeff_48', '|a|__cwt_coefficients__coeff_5__w_5__widths_(2, 5, 10, 20)', 'az__fft_coefficient__attr_"abs"__coeff_29', 'ay__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"var"', '|a|__ar_coefficient__coeff_4__k_10', 'ax__fft_coefficient__attr_"abs"__coeff_49', 'az__fft_coefficient__attr_"abs"__coeff_40', '|a|__cwt_coefficients__coeff_2__w_2__widths_(2, 5, 10, 20)', 'ay__ar_coefficient__coeff_5__k_10', 'ay__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"var"', '|a|__cwt_coefficients__coeff_10__w_20__widths_(2, 5, 10, 20)', 'az__fft_coefficient__attr_"abs"__coeff_39', 'ax__fft_coefficient__attr_"abs"__coeff_45', 'az__fft_coefficient__attr_"abs"__coeff_46', 'az__fft_coefficient__attr_"abs"__coeff_43', 'ay__number_crossing_m__m_1', '|a|__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_6__w_5__widths_(2, 5, 10, 20)', 'ax__fft_coefficient__attr_"abs"__coeff_2', '|a|__cwt_coefficients__coeff_12__w_10__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)', 'az__fft_coefficient__attr_"abs"__coeff_42', '|a|__ar_coefficient__coeff_3__k_10', '|a|__cwt_coefficients__coeff_4__w_5__widths_(2, 5, 10, 20)', 'az__quantile__q_0.3', 'az__fft_coefficient__attr_"abs"__coeff_47', 'az__fft_coefficient__attr_"abs"__coeff_45', 'az__fft_coefficient__attr_"abs"__coeff_44', '|a|__cwt_coefficients__coeff_8__w_10__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_7__w_10__widths_(2, 5, 10, 20)', 'ax__fft_coefficient__attr_"abs"__coeff_50', 'ay__fft_coefficient__attr_"abs"__coeff_0', '|a|__cwt_coefficients__coeff_9__w_20__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_6__w_10__widths_(2, 5, 10, 20)', 'az__fft_coefficient__attr_"abs"__coeff_49', 
'az__fft_coefficient__attr_"abs"__coeff_48', 'ax__ar_coefficient__coeff_4__k_10', '|a|__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"var"', 'az__fft_coefficient__attr_"abs"__coeff_41', 'ax__number_crossing_m__m_-1', '|a|__cwt_coefficients__coeff_13__w_10__widths_(2, 5, 10, 20)', '|a|__fft_coefficient__attr_"abs"__coeff_50', '|a|__cwt_coefficients__coeff_8__w_20__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_3__w_2__widths_(2, 5, 10, 20)', 'az__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"var"', 'az__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"var"', 'az__number_crossing_m__m_-1', '|a|__cwt_coefficients__coeff_5__w_10__widths_(2, 5, 10, 20)', 'ax__ar_coefficient__coeff_3__k_10', 'az__fft_coefficient__attr_"abs"__coeff_50', '|a|__cwt_coefficients__coeff_1__w_2__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_7__w_20__widths_(2, 5, 10, 20)', 'ay__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"', 'ay__ar_coefficient__coeff_2__k_10', '|a|__cwt_coefficients__coeff_14__w_10__widths_(2, 5, 10, 20)', '|a|__benford_correlation', '|a|__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)', '|a|__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"var"', 'az__fft_coefficient__attr_"abs"__coeff_0', 'ax__variance_larger_than_standard_deviation', 'ay__number_crossing_m__m_-1', '|a|__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)', '|a|__cwt_coefficients__coeff_3__w_5__widths_(2, 5, 10, 20)', 'ay__ar_coefficient__coeff_6__k_10', 'ay__quantile__q_0.4', 'az__quantile__q_0.7', '|a|__cid_ce__normalize_True', '|a|__cwt_coefficients__coeff_4__w_10__widths_(2, 5, 10, 20)', 'ay__c3__lag_1', 'ay__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"min"', 'ax__quantile__q_0.7', '|a|__autocorrelation__lag_2', '|a|__cwt_coefficients__coeff_5__w_20__widths_(2, 5, 10, 20)', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"var"', 'ay__skewness', '|a|__partial_autocorrelation__lag_1', 
'|a|__autocorrelation__lag_1', 'ax__number_crossing_m__m_1', '|a|__cwt_coefficients__coeff_4__w_20__widths_(2, 5, 10, 20)', '|a|__ar_coefficient__coeff_5__k_10', '|a|__autocorrelation__lag_3', '|a|__fft_aggregated__aggtype_"skew"', 'ay__time_reversal_asymmetry_statistic__lag_3', 'ax__ar_coefficient__coeff_2__k_10', 'ax__quantile__q_0.4', 'ay__ar_coefficient__coeff_7__k_10', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"var"', '|a|__large_standard_deviation__r_0.2', '|a|__cwt_coefficients__coeff_3__w_20__widths_(2, 5, 10, 20)', 'ax__ar_coefficient__coeff_5__k_10', 'az__quantile__q_0.4', '|a|__cwt_coefficients__coeff_3__w_10__widths_(2, 5, 10, 20)', 'ay__quantile__q_0.6', '|a|__cwt_coefficients__coeff_4__w_2__widths_(2, 5, 10, 20)', '|a|__ar_coefficient__coeff_2__k_10', 'az__ar_coefficient__coeff_2__k_10', '|a|__fft_aggregated__aggtype_"kurtosis"', '|a|__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)', '|a|__autocorrelation__lag_4', '|a|__value_count__value_1', 'az__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"min"', '|a|__cwt_coefficients__coeff_2__w_20__widths_(2, 5, 10, 20)', 'ay__count_above_mean', 'ay__count_below_mean', 'ay__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"', 'ax__cid_ce__normalize_True', 'ay__time_reversal_asymmetry_statistic__lag_2', '|a|__ratio_beyond_r_sigma__r_3', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"', '|a|__cwt_coefficients__coeff_2__w_5__widths_(2, 5, 10, 20)', '|a|__fourier_entropy__bins_100', 'ay__augmented_dickey_fuller__attr_"usedlag"__autolag_"AIC"', '|a|__fft_aggregated__aggtype_"centroid"', 'az__ar_coefficient__coeff_3__k_10', '|a|__binned_entropy__max_bins_10', 'ay__fft_aggregated__aggtype_"skew"', 'ay__ar_coefficient__coeff_8__k_10', 'ax__ar_coefficient__coeff_6__k_10', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.6', 'ax__autocorrelation__lag_7', 'ax__ar_coefficient__coeff_1__k_10', 'az__skewness', 'ax__autocorrelation__lag_6', 
'az__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"', 'ax__autocorrelation__lag_8', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.8', 'ax__autocorrelation__lag_5', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"min"', '|a|__kurtosis', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.4', 'az__longest_strike_above_mean', '|a|__longest_strike_above_mean', 'az__ar_coefficient__coeff_1__k_10', 'ax__autocorrelation__lag_4', 'ax__autocorrelation__lag_9', 'ay__cid_ce__normalize_True', 'az__cid_ce__normalize_True', '|a|__ar_coefficient__coeff_6__k_10', 'ax__autocorrelation__lag_3', 'ay__fft_aggregated__aggtype_"centroid"', '|a|__autocorrelation__lag_5', 'ay__number_peaks__n_1', 'ax__autocorrelation__lag_2', '|a|__cwt_coefficients__coeff_2__w_10__widths_(2, 5, 10, 20)', 'az__count_below_mean', 'az__count_above_mean', '|a|__partial_autocorrelation__lag_3', 'ay__autocorrelation__lag_1', 'ay__partial_autocorrelation__lag_1', '|a|__cwt_coefficients__coeff_1__w_20__widths_(2, 5, 10, 20)', 'ay__lempel_ziv_complexity__bins_10', 'ay__ar_coefficient__coeff_10__k_10', '|a|__agg_autocorrelation__f_agg_"var"__maxlag_40', 'ay__approximate_entropy__m_2__r_0.3', 'ay__ratio_beyond_r_sigma__r_3', 'ax__number_peaks__n_10', 'ay__value_count__value_0', 'ay__fft_coefficient__attr_"angle"__coeff_0', 'az__autocorrelation__lag_3', 'az__autocorrelation__lag_2', 'ay__ar_coefficient__coeff_1__k_10', 'ay__autocorrelation__lag_2', 'ay__binned_entropy__max_bins_10', 'ay__number_peaks__n_3', 'az__permutation_entropy__dimension_3__tau_1', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.2', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_0.6__ql_0.2', 'ay__partial_autocorrelation__lag_3', 'ay__fourier_entropy__bins_100', '|a|__partial_autocorrelation__lag_4', 'az__number_cwt_peaks__n_1', '|a|__large_standard_deviation__r_0.25', 'az__autocorrelation__lag_4', 'ay__ar_coefficient__coeff_9__k_10', 
'az__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.6', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_0.6__ql_0.4', 'az__ar_coefficient__coeff_5__k_10', '|a|__autocorrelation__lag_6', 'az__ar_coefficient__coeff_4__k_10', 'az__permutation_entropy__dimension_4__tau_1', '|a|__augmented_dickey_fuller__attr_"usedlag"__autolag_"AIC"', 'ax__fourier_entropy__bins_100', 'ax__approximate_entropy__m_2__r_0.9', 'az__autocorrelation__lag_5', '|a|__ratio_beyond_r_sigma__r_1', 'ax__ar_coefficient__coeff_7__k_10', 'ax__value_count__value_0', 'ay__partial_autocorrelation__lag_2', 'ax__partial_autocorrelation__lag_1', 'ax__autocorrelation__lag_1', 'ay__number_cwt_peaks__n_1', 'az__autocorrelation__lag_6', 'az__autocorrelation__lag_1', 'az__partial_autocorrelation__lag_1', '|a|__number_peaks__n_3', '|a|__skewness', 'ax__fourier_entropy__bins_10', 'az__autocorrelation__lag_7', 'ay__time_reversal_asymmetry_statistic__lag_1', '|a|__cwt_coefficients__coeff_9__w_5__widths_(2, 5, 10, 20)', 'az__value_count__value_0', '|a|__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.2', '|a|__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.4', 'az__autocorrelation__lag_8', 'az__approximate_entropy__m_2__r_0.9', 'ax__agg_autocorrelation__f_agg_"var"__maxlag_40', '|a|__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.8', 'az__autocorrelation__lag_9', 'ay__c3__lag_2', 'az__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"min"', '|a|__autocorrelation__lag_7', '|a|__ratio_beyond_r_sigma__r_2.5', 'ay__autocorrelation__lag_3', 'ay__fourier_entropy__bins_3', '|a|__ratio_beyond_r_sigma__r_1.5', 'ay__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"min"', 'ax__number_cwt_peaks__n_1', 'ay__approximate_entropy__m_2__r_0.5', 'az__change_quantiles__f_agg_"mean"__isabs_False__qh_0.6__ql_0.2', '|a|__lempel_ziv_complexity__bins_5', '|a|__sum_of_reoccurring_values', 'ay__sum_values', 'ay__mean', 'ay__fft_coefficient__attr_"real"__coeff_0', 
'az__permutation_entropy__dimension_5__tau_1', 'ay__sample_entropy', 'az__approximate_entropy__m_2__r_0.7', '|a|__lempel_ziv_complexity__bins_10', 'az__c3__lag_1', 'az__time_reversal_asymmetry_statistic__lag_3', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_0.4__ql_0.2', 'az__fourier_entropy__bins_100', '|a|__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.6', 'ax__fft_coefficient__attr_"real"__coeff_0', 'ax__sum_values', 'ax__mean', 'ay__number_peaks__n_5', 'ay__has_duplicate_max', 'az__ar_coefficient__coeff_6__k_10', '|a|__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.6', 'ay__cwt_coefficients__coeff_8__w_2__widths_(2, 5, 10, 20)', '|a|__ar_coefficient__coeff_7__k_10', 'ax__approximate_entropy__m_2__r_0.7', '|a|__number_peaks__n_5', 'ay__ratio_beyond_r_sigma__r_1.5', '|a|__partial_autocorrelation__lag_2', '|a|__autocorrelation__lag_8', 'ax__benford_correlation', 'ay__fourier_entropy__bins_2', 'az__fft_coefficient__attr_"angle"__coeff_0', 'ax__ratio_beyond_r_sigma__r_3', 'ay__ar_coefficient__coeff_0__k_10', 'ay__fft_aggregated__aggtype_"kurtosis"', '|a|__lempel_ziv_complexity__bins_100', 'ay__large_standard_deviation__r_0.2', 'az__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.4', '|a|__fft_aggregated__aggtype_"variance"', 'ay__lempel_ziv_complexity__bins_100', '|a|__autocorrelation__lag_9', 'ay__lempel_ziv_complexity__bins_5', 'ay__benford_correlation', '|a|__approximate_entropy__m_2__r_0.9', '|a|__change_quantiles__f_agg_"mean"__isabs_False__qh_0.6__ql_0.2', '|a|__fourier_entropy__bins_10', 'ay__cwt_coefficients__coeff_5__w_2__widths_(2, 5, 10, 20)', 'az__fft_aggregated__aggtype_"centroid"', 'ax__fourier_entropy__bins_5', 'az__median', 'ay__cwt_coefficients__coeff_9__w_2__widths_(2, 5, 10, 20)', 'ax__fourier_entropy__bins_3', 'az__ratio_beyond_r_sigma__r_3', '|a|__cwt_coefficients__coeff_1__w_10__widths_(2, 5, 10, 20)', 
'az__permutation_entropy__dimension_6__tau_1', 'az__fft_aggregated__aggtype_"skew"', 'ay__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.0', 'ay__variation_coefficient', '|a|__cwt_coefficients__coeff_1__w_5__widths_(2, 5, 10, 20)', 'ay__cwt_coefficients__coeff_7__w_2__widths_(2, 5, 10, 20)', 'ay__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"mean"', 'ax__ratio_beyond_r_sigma__r_1.5', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"', '|a|__sum_of_reoccurring_data_points', 'ax__c3__lag_3', 'az__approximate_entropy__m_2__r_0.5', 'ay__has_duplicate', 'az__change_quantiles__f_agg_"mean"__isabs_False__qh_0.4__ql_0.2', 'az__change_quantiles__f_agg_"mean"__isabs_False__qh_0.6__ql_0.4', 'ay__cwt_coefficients__coeff_6__w_2__widths_(2, 5, 10, 20)', 'ay__kurtosis', 'az__number_peaks__n_3', 'az__approximate_entropy__m_2__r_0.1', 'ax__fft_coefficient__attr_"imag"__coeff_1', '|a|__cwt_coefficients__coeff_10__w_5__widths_(2, 5, 10, 20)', 'ax__median', 'ay__lempel_ziv_complexity__bins_3', 'az__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"', 'az__number_crossing_m__m_0', 'az__number_peaks__n_5', 'ay__ratio_beyond_r_sigma__r_2.5', 'ax__approximate_entropy__m_2__r_0.5', 'ay__number_cwt_peaks__n_5', 'az__time_reversal_asymmetry_statistic__lag_2', 'az__change_quantiles__f_agg_"mean"__isabs_False__qh_0.8__ql_0.2', 'ax__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"min"', 'ay__autocorrelation__lag_9', 'ax__ar_coefficient__coeff_0__k_10', 'az__has_duplicate_max', 'ax__ar_coefficient__coeff_8__k_10', 'ay__value_count__value_1', 'ay__autocorrelation__lag_4', 'ay__fft_aggregated__aggtype_"variance"', 'ax__agg_linear_trend__attr_"rvalue"__chunk_len_10__f_agg_"min"', 'ax__ratio_beyond_r_sigma__r_2.5']
# Label every p-value with its feature name, then show only the entries that
# belong to the relevant features.
p_values = pd.Series(data=select.p_values, index=select.features)
p_values.loc[select.relevant_features]
|a|__fft_coefficient__attr_"real"__coeff_0 9.581794e-47
|a|__mean 9.581794e-47
|a|__fft_coefficient__attr_"abs"__coeff_0 9.581794e-47
|a|__sum_values 9.581794e-47
|a|__abs_energy 1.045313e-46
...
ay__value_count__value_1 1.916598e-03
ay__autocorrelation__lag_4 2.004167e-03
ay__fft_aggregated__aggtype_"variance" 2.024619e-03
ax__agg_linear_trend__attr_"rvalue"__chunk_len_10__f_agg_"min" 2.076591e-03
ax__ratio_beyond_r_sigma__r_2.5 2.152619e-03
Length: 841, dtype: float64
# First five entries of the relevant-feature list.
select.relevant_features[:5]
['|a|__fft_coefficient__attr_"real"__coeff_0', '|a|__mean', '|a|__fft_coefficient__attr_"abs"__coeff_0', '|a|__sum_values', '|a|__abs_energy']
from sklearn.ensemble import RandomForestClassifier
# Random forest restricted to max_features=5, trained on the columns of X that
# the selector marked as relevant, with y3 (True = swimming) as the target.
# NOTE(review): the relevant features were chosen using all windows, so this
# fit has no held-out data - it is for exploration, not evaluation.
clf = RandomForestClassifier(max_features=5)
clf.fit(X[select.relevant_features], y3)
RandomForestClassifier(max_features=5)
from sklearn.pipeline import make_pipeline
# Pipeline of tsfresh feature selection followed by a default random forest;
# fitting the pipeline re-runs the selection on whatever rows it is given.
model = make_pipeline(FeatureSelector(), RandomForestClassifier())
from sklearn.model_selection import RepeatedKFold
# 10-fold cross-validation repeated 10 times.
kf = RepeatedKFold(n_repeats=10, n_splits=10)
# Work on the feature columns that contain no NaN values.
Xd = X.dropna(axis=1)
# Collect the predicted class probabilities and the true labels of each test fold.
predictions, truth = [], []
for train, test in kf.split(Xd):
    X_train, y_train = Xd.iloc[train], y3.iloc[train]
    model.fit(X_train, y_train)
    fold_proba = model.predict_proba(Xd.iloc[test])
    predictions.append(fold_proba)
    truth.append(y3.iloc[test])
from sklearn.model_selection import cross_val_score
# ROC AUC for every split produced by the repeated k-fold splitter kf.
# NOTE(review): this scores `clf` (the forest with max_features=5) on all
# columns of Xd, whereas the ROC curve further down is built from the `model`
# pipeline - confirm which estimator was meant.
scores = cross_val_score(estimator=clf, X=Xd, y=y3, scoring='roc_auc', cv=kf)
scores
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
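The ROC AUC score is 1.0 for every one of the 100 train/test splits (10 folds × 10 repeats), i.e. the scores do not vary with changes in the test and training sets. Hence, the model does not appear to be overfitted.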
# Pool the per-fold results into one probability array and one label array,
# then check that their lengths agree.
p, t = (np.concatenate(parts) for parts in (predictions, truth))
p.shape, t.shape
((2760, 2), (2760,))
from sklearn.metrics import roc_curve, roc_auc_score
# ROC curve of the pooled out-of-fold predictions; the second column of p is
# used as the score for the True (swimming) class.
swim_proba = p[:, 1]
fpr, tpr, thresholds = roc_curve(t, swim_proba)
plt.plot(fpr, tpr)
[<matplotlib.lines.Line2D at 0x257cce08>]
# Area under the ROC curve for the pooled predictions (second column of p as the score).
roc_auc_score(t,p[:,1])
0.9999259259259259
The score is very close to 1.0; therefore, this is a very good classifier.
# Pair-plot five of the relevant |a| features, coloured by activity.
top_features = [
    '|a|__fft_coefficient__attr_"real"__coeff_0',
    '|a|__mean',
    '|a|__fft_coefficient__attr_"abs"__coeff_0',
    '|a|__sum_values',
    '|a|__abs_energy',
]
# Take an explicit copy: assigning a new column onto a selection of X triggers
# pandas' SettingWithCopyWarning and leaves it unclear which frame is modified.
Xi = X[top_features].copy()
Xi["activity"] = y3  # labels are matched to rows through the shared index
sns.pairplot(Xi, hue="activity")  # where True is swimming
<seaborn.axisgrid.PairGrid at 0x24e42aa8>
The orange and blue dots are reasonably distinct in the majority of the plots. This could explain why the model does well in distinguishing between the two signals.
Visualising some common features
from tsfresh.feature_extraction.settings import MinimalFCParameters
# tsfresh's minimal feature-calculator settings; list the calculator names in
# alphabetical order.
minimal_fc = MinimalFCParameters()
minimal = sorted(minimal_fc)
minimal
['length', 'maximum', 'mean', 'median', 'minimum', 'standard_deviation', 'sum_values', 'variance']
# Show the settings mapping itself (calculator name -> parameters).
minimal_fc
{'sum_values': None,
'median': None,
'mean': None,
'length': None,
'standard_deviation': None,
'variance': None,
'maximum': None,
'minimum': None}
# Work on a copy of the minimal settings with the standard-deviation
# calculator removed; minimal_fc itself is left untouched.
mod_minimal_fc = minimal_fc.copy()
mod_minimal_fc.pop('standard_deviation')
mod_minimal_fc
{'sum_values': None,
'median': None,
'mean': None,
'length': None,
'variance': None,
'maximum': None,
'minimum': None}
# Look at just the |a| signal: drop the three per-axis acceleration columns.
df3 = df2.drop(columns=['ax', 'ay', 'az'])
df3.head()
| |a| | seconds | window_idx | |
|---|---|---|---|
| 2020-09-23 13:41:42.107 | 0.26 | 0.403 | s000 |
| 2020-09-23 13:41:42.116 | 0.25 | 0.412 | s000 |
| 2020-09-23 13:41:42.126 | 0.27 | 0.422 | s000 |
| 2020-09-23 13:41:42.136 | 0.20 | 0.432 | s000 |
| 2020-09-23 13:41:42.146 | 0.07 | 0.442 | s000 |
# Extract only the reduced minimal feature set (mod_minimal_fc) from df3, one row
# per window_idx, with samples inside a window ordered by the 'seconds' column.
X4 = extract_features(df3, column_id='window_idx', column_sort='seconds', default_fc_parameters=mod_minimal_fc)
Feature Extraction: 100%|██████████| 20/20 [00:02<00:00, 9.01it/s]
# Preview the first rows of the reduced feature matrix.
X4.head()
| |a|__sum_values | |a|__median | |a|__mean | |a|__length | |a|__variance | |a|__maximum | |a|__minimum | |
|---|---|---|---|---|---|---|---|
| fb000 | 309.44 | 2.910 | 3.0944 | 100.0 | 1.350395 | 7.89 | 0.63 |
| fb001 | 575.17 | 5.205 | 5.7517 | 100.0 | 12.327392 | 14.23 | 0.71 |
| fb002 | 824.87 | 7.555 | 8.2487 | 100.0 | 15.486413 | 23.30 | 1.44 |
| fb003 | 591.43 | 5.450 | 5.9143 | 100.0 | 8.962543 | 13.73 | 1.35 |
| fb004 | 427.34 | 3.780 | 4.2734 | 100.0 | 9.798202 | 21.34 | 0.95 |
# Attach the activity label to the reduced feature matrix and colour the pair
# plot by it. Note that this adds the label column to X4 in place.
X4["activity"] = y3
sns.pairplot(X4,hue = "activity") # where True is swimming
<seaborn.axisgrid.PairGrid at 0x1bf95fe8>
The orange and blue dots are reasonably distinct in the majority of the plots. This could explain why the model does well in distinguishing between the two signals.